{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from ase.io import read\n",
    "from ase.visualize import view\n",
    "from itertools import product, chain\n",
    "from collections import defaultdict\n",
    "from copy import deepcopy\n",
    "from time import time\n",
    "from ase import Atom\n",
    "\n",
    "import warnings\n",
    "import helper\n",
    "import chemdata\n",
    "reload(chemdata)\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "from sklearn.decomposition import PCA, KernelPCA\n",
    "\n",
    "from sklearn.kernel_ridge import KernelRidge\n",
    "\n",
    "from sklearn.model_selection import ShuffleSplit, KFold, cross_val_predict\n",
    "\n",
    "from sklearn.neighbors import KNeighborsRegressor as KNR\n",
    "from sklearn.neighbors.kd_tree import KDTree\n",
    "\n",
    "from sklearn.preprocessing import OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from matplotlib import cm\n",
    "from matplotlib import gridspec\n",
    "import matplotlib.pyplot as plt\n",
    "from mpl_toolkits import mplot3d\n",
    "%matplotlib qt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Found 49 atoms in the structure\n",
      "There are 49 atoms in the reflection dictionary\n",
      "WARNING: Zr configuration at line 24080 is already included\n",
      "WARNING: Zr configuration at line 24132 is already included\n",
      "WARNING: Zr configuration at line 24184 is already included\n",
      "WARNING: Zr configuration at line 24236 is already included\n",
      "WARNING: Zr configuration at line 24288 is already included\n",
      "WARNING: Zr configuration at line 24340 is already included\n",
      "WARNING: Zr configuration at line 24392 is already included\n",
      "WARNING: Zr configuration at line 24444 is already included\n",
      "WARNING: Zr configuration at line 24496 is already included\n",
      "WARNING: Zr configuration at line 24548 is already included\n",
      "WARNING: Zr configuration at line 24600 is already included\n",
      "WARNING: Zr configuration at line 24652 is already included\n",
      "WARNING: Zr configuration at line 24704 is already included\n",
      "WARNING: Zr configuration at line 24756 is already included\n",
      "WARNING: Zr configuration at line 24808 is already included\n",
      "WARNING: Zr configuration at line 24860 is already included\n",
      "WARNING: Zr configuration at line 24912 is already included\n",
      "WARNING: Zr configuration at line 24964 is already included\n",
      "WARNING: Zr configuration at line 25016 is already included\n",
      "WARNING: Zr configuration at line 25068 is already included\n",
      "WARNING: Zr configuration at line 25120 is already included\n",
      "WARNING: Zr configuration at line 23617 is already included\n",
      "WARNING: Zr configuration at line 23668 is already included\n",
      "WARNING: Zr configuration at line 23719 is already included\n",
      "WARNING: Zr configuration at line 23770 is already included\n",
      "WARNING: Zr configuration at line 23821 is already included\n",
      "WARNING: Zr configuration at line 23872 is already included\n",
      "WARNING: Zr configuration at line 23923 is already included\n",
      "WARNING: Zr configuration at line 23974 is already included\n",
      "WARNING: Zr configuration at line 24025 is already included\n",
      "WARNING: Zr configuration at line 24076 is already included\n",
      "WARNING: Zr configuration at line 24127 is already included\n",
      "WARNING: Zr configuration at line 24178 is already included\n",
      "WARNING: Zr configuration at line 24229 is already included\n",
      "WARNING: Zr configuration at line 24280 is already included\n",
      "WARNING: Zr configuration at line 24331 is already included\n",
      "WARNING: Zr configuration at line 24382 is already included\n",
      "WARNING: Zr configuration at line 24433 is already included\n",
      "WARNING: Zr configuration at line 24484 is already included\n",
      "WARNING: Zr configuration at line 24535 is already included\n",
      "WARNING: Zr configuration at line 24586 is already included\n",
      "WARNING: Zr configuration at line 24637 is already included\n",
      "\n",
      "Found 6740 data points for adsorption\n",
      "Found 6943 data points for formation\n",
      "Found 5457 adsorption data points after canonicalizing.\n",
      "Found 5619 adsorption data points after canonicalizing.\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ti')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ti')\n",
      "Ti 15.9631734625 0.000624404999997\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'V')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'V')\n",
      "V 17.3954106925 0.000603040000001\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cr')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cr')\n",
      "Cr 18.4421536175 0.001194795\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mn')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mn')\n",
      "Mn 18.1227963075 0.000185204999994\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Fe')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Fe')\n",
      "Fe 17.5668675975 0.00218703\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Co')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Co')\n",
      "Co 16.7571184775 0.00203148\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ni')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ni')\n",
      "Ni 15.94083164 3.36100000036e-05\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cu')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cu')\n",
      "Cu 15.3278573075 0.000468084999994\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zn')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zn')\n",
      "Zn 14.8336656875 0.000342244999999\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zr')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zr')\n",
      "Zr 15.6203793975 0.000105975000012\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mo')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mo')\n",
      "Mo 18.3786345425 4.92499999893e-06\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ru')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ru')\n",
      "Ru 17.2741843325 0.000715560000003\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Rh')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Rh')\n",
      "Rh 15.7767561275 0.000776280000012\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Pd')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Pd')\n",
      "Pd 14.7718612175 0.000369774999996\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cd')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cd')\n",
      "Cd 14.8886779575 1.99199999997e-05\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Hf')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Hf')\n",
      "Hf 15.6272584525 8.02499999963e-05\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ta')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ta')\n",
      "Ta 17.4131288175 0.000116899999995\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'W')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'W')\n",
      "W 18.8810172925 1.12599999937e-05\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Re')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Re')\n",
      "Re 18.7999347725 0.000695395000008\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Os')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Os')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Os')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Os')\n",
      "Found no values for Os\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ir')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ir')\n",
      "Ir 16.402621105 0.000856980000009\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Pt')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Pt')\n",
      "Pt 15.00941334 5.84799999928e-05\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Au')\n",
      "Missing bulk-like energy:  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Au')\n",
      "Au 14.8627756675 5.87350000103e-05\n",
      "\n",
      "----- starting in-bulk dimer formation energy ------\n",
      "\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ti')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ti')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ti')\n",
      "Ti -0.34762557 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'V')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'V')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'V')\n",
      "V -1.67438444 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cr')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cr')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cr')\n",
      "Cr -2.59312709 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mn')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mn')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mn')\n",
      "Mn -1.80970456 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Fe')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Fe')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Fe')\n",
      "Fe -0.77035489 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Co')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Co')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Co')\n",
      "Co -0.17578783 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ni')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ni')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ni')\n",
      "Ni -0.07013369 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cu')\n",
      "Cu -0.0151565833333 0.00264600546547\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zn')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zn')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zn')\n",
      "Zn 0.1335683 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zr')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zr')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Zr')\n",
      "Zr -0.11071895 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mo')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mo')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Mo')\n",
      "Mo -2.35570358 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ru')\n",
      "Ru -0.707318406667 0.000501976932887\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Rh')\n",
      "Rh -0.000932333333319 0.0007567946202\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Pd')\n",
      "Pd 0.0123681166667 0.00168137662599\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Cd')\n",
      "Cd 0.144874753333 0.00105323738813\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Hf')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Hf')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Hf')\n",
      "Hf -0.03953253 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ta')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ta')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ta')\n",
      "Ta -1.17104923 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'W')\n",
      "W -2.26604572333 0.00035387780286\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Re')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Re')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Re')\n",
      "Re -2.35204749 0.0\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Os')\n",
      "Missing index arrangement (30,29):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Os')\n",
      "Missing index arrangement (29,31):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Os')\n",
      "Missing index arrangement (31,28):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Os')\n",
      "Found no values for Os\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Ir')\n",
      "Ir -0.20349172 0.00155834956684\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Pt')\n",
      "Pt -0.01172499 0.00146951059902\n",
      "Missing index arrangement (28,30):  ((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 'Au')\n",
      "Au 0.01879836 0.000128838406028\n",
      "(31, 31)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/dist-packages/ipykernel_launcher.py:306: RuntimeWarning: divide by zero encountered in divide\n",
      "/usr/local/lib/python2.7/dist-packages/ipykernel_launcher.py:315: RuntimeWarning: divide by zero encountered in power\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.\n",
      "  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.\n",
      "  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]\n",
      "Number of features: 169\n",
      "0 \t16\n",
      "1 \t17\n",
      "2 \t18\n",
      "3 \t20\n",
      "4 \t21\n",
      "5 \t22\n",
      "6 \t23\n",
      "7 \t24\n",
      "8 \t25\n",
      "9 \t26\n",
      "10 \t27\n",
      "11 \t28\n",
      "12 \t29\n",
      "13 \t30\n",
      "14 \t31\n",
      "15 \t32\n",
      "16 \t33\n",
      "17 \t34\n",
      "18 \t35\n",
      "19 \t36\n",
      "20 \t37\n",
      "21 \t38\n",
      "22 \t39\n",
      "23 \t40\n",
      "24 \t41\n",
      "25 \t42\n",
      "26 \t43\n",
      "27 \t44\n",
      "28 \t45\n",
      "29 \t46\n",
      "30 \t47\n",
      "31 \tnum1NN\n",
      "32 \tnum2NN\n",
      "33 \tnum3NN\n",
      "34 \tnumRest\n",
      "35 \tHMproximity\n",
      "36 \tMMproximity\n",
      "37 \tspread\n",
      "38 \tstep_dimer\n",
      "39 \tIL_dimer\n",
      "40 \tsurf_dimer\n",
      "41 \tXstep_dimer\n",
      "42 \tstepNN\n",
      "43 \tdimer_E\n",
      "44 \tDvM_Eads\n",
      "45 \tnumM\n",
      "46 \tEF_isolated\n",
      "47 \tpauling\n",
      "48 \tcov_radius\n",
      "49 \tdensity\n",
      "50 \tH_fusion\n",
      "51 \tion_energy\n",
      "52 \tbulk_modulus\n",
      "53 \tpoisson_ratio\n",
      "54 \tconductivity\n",
      "55 \td_energy\n",
      "56 \td_radius\n",
      "57 \tis_hcp\n",
      "58 \tis_bcc\n",
      "59 \trow\n",
      "60 \tcol\n",
      "61 \tbulk_E\n",
      "62 \tmono_E\n",
      "63 \tmoment_40_1\n",
      "64 \tmoment_40_2\n",
      "65 \tmoment_40_3\n",
      "66 \tmoment_40_4\n",
      "67 \tmoment_40_5\n",
      "68 \tmoment_45_1\n",
      "69 \tmoment_45_2\n",
      "70 \tmoment_45_3\n",
      "71 \tmoment_45_4\n",
      "72 \tmoment_45_5\n",
      "73 \tmoment_46_1\n",
      "74 \tmoment_46_2\n",
      "75 \tmoment_46_3\n",
      "76 \tmoment_46_4\n",
      "77 \tmoment_46_5\n",
      "78 \tTBe_1\n",
      "79 \tTBe_2\n",
      "80 \tTBe_3\n",
      "81 \tTBe_4\n",
      "82 \tTBe_5\n",
      "83 \tTBe_6\n",
      "84 \tTBe_7\n",
      "85 \tTBe_8\n",
      "86 \tTBe_9\n",
      "87 \tTBe_10\n",
      "88 \tTBe_11\n",
      "89 \tTBe_12\n",
      "90 \tTBe_13\n",
      "91 \tTBe_14\n",
      "92 \tTBe_15\n",
      "93 \tTBe_16\n",
      "94 \tTBe_17\n",
      "95 \tTBe_18\n",
      "96 \tTBe_19\n",
      "97 \tTBe_20\n",
      "98 \tTBe_21\n",
      "99 \tTBe_22\n",
      "100 \tTBe_23\n",
      "101 \tTBe_24\n",
      "102 \tTBe_25\n",
      "103 \tTBe_26\n",
      "104 \tTBe_27\n",
      "105 \tTBe_28\n",
      "106 \tTBe_29\n",
      "107 \tTBe_30\n",
      "108 \tTBe_31\n",
      "109 \tTBe_32\n",
      "110 \tTBe_33\n",
      "111 \tTBe_34\n",
      "112 \tTBe_35\n",
      "113 \tTBe_36\n",
      "114 \tTBe_37\n",
      "115 \tTBe_38\n",
      "116 \tTBe_39\n",
      "117 \tTBe_40\n",
      "118 \tTBe_41\n",
      "119 \tTBe_42\n",
      "120 \tTBe_43\n",
      "121 \tTBe_44\n",
      "122 \tTBe_45\n",
      "123 \tTBe_46\n",
      "124 \tTBe_47\n",
      "125 \tTBe_48\n",
      "126 \tC_0\n",
      "127 \tC_1\n",
      "128 \tC_2\n",
      "129 \tC_3\n",
      "130 \tC_4\n",
      "131 \tC_5\n",
      "132 \tC_6\n",
      "133 \tC_7\n",
      "134 \tC_8\n",
      "135 \tC_9\n",
      "136 \tC_10\n",
      "137 \tC_11\n",
      "138 \tC_12\n",
      "139 \tC_13\n",
      "140 \tC_14\n",
      "141 \tC_15\n",
      "142 \tC_16\n",
      "143 \tC_17\n",
      "144 \tC_18\n",
      "145 \tC_19\n",
      "146 \tC_20\n",
      "147 \tC_21\n",
      "148 \tC_22\n",
      "149 \tC_23\n",
      "150 \tC_24\n",
      "151 \tC_25\n",
      "152 \tC_26\n",
      "153 \tC_27\n",
      "154 \tC_28\n",
      "155 \tC_29\n",
      "156 \tC_30\n",
      "157 \tiso_1\n",
      "158 \tiso_2\n",
      "159 \tiso_3\n",
      "160 \tiso_4\n",
      "161 \tiso_5\n",
      "162 \tiso_6\n",
      "163 \td_ed_1\n",
      "164 \td_ed_2\n",
      "165 \td_ed_3\n",
      "166 \td_wf_1\n",
      "167 \td_wf_2\n",
      "168 \td_wf_3\n",
      "47\n",
      "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 59, 60]\n",
      "Reading adsorption structures\n",
      "6770 files in path\n",
      "0.00% done\n",
      "7.39% done\n",
      "14.77% done\n",
      "22.16% done\n",
      "29.54% done\n",
      "36.93% done\n",
      "44.31% done\n",
      "51.70% done\n",
      "59.08% done\n",
      "66.47% done\n",
      "73.86% done\n",
      "81.24% done\n",
      "88.63% done\n",
      "96.01% done\n",
      "Found 6762 formation energy structures\n",
      "Reading slab (formation) structures\n",
      "6972 files in path\n",
      "0.00% done\n",
      "7.17% done\n",
      "14.34% done\n",
      "21.51% done\n",
      "28.69% done\n",
      "35.86% done\n",
      "43.03% done\n",
      "50.20% done\n",
      "57.37% done\n",
      "64.54% done\n",
      "71.72% done\n",
      "78.89% done\n",
      "86.06% done\n",
      "93.23% done\n",
      "169\n",
      "169\n",
      "KMeans clustering on features (35,37) using 100 clusters\n",
      "\n",
      "Found 65 unique clusters after merging\n",
      "\n",
      "--- generated corresponding one-hot coding ---\n",
      "\n",
      "Processing adsorption structures\n",
      "14.79% done\n",
      "29.58% done\n",
      "44.37% done\n",
      "59.15% done\n",
      "73.94% done\n",
      "88.73% done\n",
      "Processing slab (formation) structures\n",
      "14.79% done\n",
      "29.58% done\n",
      "44.37% done\n",
      "59.15% done\n",
      "73.94% done\n",
      "88.73% done\n",
      "--- COMPLETE ---\n",
      "CPU times: user 2min 40s, sys: 3min 26s, total: 6min 7s\n",
      "Wall time: 4min 12s\n",
      "CPU times: user 2min 30s, sys: 2min 44s, total: 5min 15s\n",
      "Wall time: 2min 51s\n",
      "CPU times: user 2min 32s, sys: 2min 56s, total: 5min 28s\n",
      "Wall time: 3min 9s\n",
      "CPU times: user 148 ms, sys: 8 ms, total: 156 ms\n",
      "Wall time: 127 ms\n",
      "CPU times: user 68 ms, sys: 12 ms, total: 80 ms\n",
      "Wall time: 81.7 ms\n",
      "CPU times: user 9.63 s, sys: 13.6 s, total: 23.2 s\n",
      "Wall time: 16.5 s\n",
      "CPU times: user 10.1 s, sys: 14.8 s, total: 24.8 s\n",
      "Wall time: 17.3 s\n",
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 1 ms\n",
      "CPU times: user 7.6 s, sys: 11.3 s, total: 18.9 s\n",
      "Wall time: 13.2 s\n"
     ]
    }
   ],
   "source": [
    "nearest = [40,45,46]\n",
    "\n",
    "def cvp(model,X,y,nj=4):\n",
    "    return cross_val_predict(model,X,y,cv=KFold(n_splits=4,shuffle=True),n_jobs=nj)\n",
    "\n",
    "def cvs(model,X,y,nj=4):\n",
    "    return 1000.0*rmse(cvp(model,X,y),y)\n",
    "\n",
    "# Given Xforest, a matrix of data points (e.g. Xforest[0] are values for the first point),\n",
    "# and a trained forest, construct the forest's kernel matrix for the points in Xforest \n",
    "def get_forest_kernel_faster(Xforest,forest,update_interval=500):\n",
    "    L = eforest.apply(Xforest)\n",
    "    S = np.empty_like(L,dtype=np.float)\n",
    "\n",
    "    for t in xrange(L.shape[1]):\n",
    "        S[:,t] = 1.0*np.bincount(L[:,t])[L[:,t]]\n",
    "\n",
    "    K = np.zeros((L.shape[0],L.shape[0]))\n",
    "    told = time()\n",
    "\n",
    "    for i in xrange(K.shape[0]):\n",
    "        if update_interval and not i%int(update_interval):\n",
    "            print i, time()-told\n",
    "            told=time()\n",
    "        \n",
    "        K[i,i:] = np.sum((L[i,:]==L[i:,:])/S[i,:],axis=1)\n",
    "        K[i:,i] = K[i,i:]\n",
    "        \n",
    "    return K\n",
    "\n",
    "# Returns the dopant symbol for the given ASE structure\n",
    "def symbol_from_structure(atoms_object):\n",
    "    for atom in atoms_object:\n",
    "        if atom.symbol not in ['H','Ag']:\n",
    "            return atom.symbol\n",
    "        \n",
    "    assert False, \"ERROR: Structure had no dopant element!\"\n",
    "\n",
    "# Returns the key, used in the dictionaries below, for the given structure\n",
    "# Keys are (one_hot_vector,symbol)\n",
    "def structure_to_key(atoms_object):\n",
    "    if atoms_object[0].symbol == 'H':\n",
    "        start = 1\n",
    "    else:\n",
    "        start = 0\n",
    "        \n",
    "    one_hot = [(atom.symbol not in ['H','Ag']) for atom in atoms_object[start:]]\n",
    "    one_hot = tuple(map(lambda b: 1 if b else 0, one_hot))\n",
    "    \n",
    "    symbol = atoms_object[one_hot.index(1)].symbol\n",
    "    \n",
    "    return (one_hot,symbol)\n",
    "    \n",
    "# Read the Ag(211) structure and get its lattice vectors. cell[J] is the J^th lattice vector\n",
    "s = read('template.vasp')\n",
    "cell = s.get_cell().copy()\n",
    "\n",
    "# Map metal atoms according to boundary conditions to minimize their distance to hydrogen\n",
    "# Then construct a dictionary such that j==reflect[i] means sites i and j are symmetrically equivalent\n",
    "helper.contract_around_atom(s,0)\n",
    "stepdir = s.positions[7] - s.positions[6]\n",
    "reflector = helper.reflect(stepdir,s.positions[0])\n",
    "reflect = helper.reflection_map(s,reflector)\n",
    "\n",
    "distances = s.get_distances(0,range(1,len(s)))\n",
    "distances2 = s.get_distances(0,range(1,len(s)),mic=True)\n",
    "\n",
    "# clean up cases where atoms are on the cell boundary\n",
    "# in these cases, the reflected position differs by a\n",
    "# lattice constant relative to the original position\n",
    "for i in xrange(len(s)):\n",
    "    if i not in reflect: # try cell vector displacements\n",
    "        p = s.positions[i]\n",
    "        for m,vector in product((-1,1),cell):\n",
    "            if np.linalg.norm(reflector(p)-p-m*vector) < 0.1:\n",
    "                reflect[i] = i # atom is symmetric to itself via displacement\n",
    "                print \"Atom %d is symmetric to itself (lattice vector displacement)\"%i                \n",
    "print\n",
    "print \"Found %d atoms in the structure\" % len(s)\n",
    "print \"There are %d atoms in the reflection dictionary\" % len(reflect)\n",
    "if len(s) != len(reflect):\n",
    "    print\n",
    "    print \"Warning: some atoms do not have reflection symmetry\"\n",
    "\n",
    "# Parse the one and two atom results\n",
    "# Parsing multi-atom results may break, I changed some things around\n",
    "Eads_dir = 'newest_structures/'\n",
    "one_atom_results = helper.parse_adsEnDB(Eads_dir+'adsEnDBRelCoords1Dopant.txt')\n",
    "two_atom_results = helper.parse_adsEnDB(Eads_dir+'adsEnDBRelCoords2Dopant.txt')\n",
    "three_atom_results = helper.parse_adsEnDB(Eads_dir+'adsEnDBRelCoords3Dopant.txt')\n",
    "four_atom_results = helper.parse_adsEnDB(Eads_dir+'adsEnDBRelCoords4Dopant.txt')\n",
    "five_atom_results = helper.parse_adsEnDB(Eads_dir+'adsEnDBRelCoords5Dopant.txt')\n",
    "six_atom_results = helper.parse_adsEnDB(Eads_dir+'adsEnDBRelCoords6Dopant.txt')\n",
    "seven_atom_results = helper.parse_adsEnDB(Eads_dir+'adsEnDBRelCoords7Dopant.txt')\n",
    "eight_atom_results = helper.parse_adsEnDB(Eads_dir+'adsEnDBRelCoords8Dopant.txt')\n",
    "\n",
    "# Easy way to change whether one, two, ... etc atom cases are included\n",
    "# (comment out unwanted data sources)\n",
    "datasources = (\n",
    "    ('1Dopant',one_atom_results),\n",
    "    ('2Dopant',two_atom_results),\n",
    "    ('3Dopant',three_atom_results),\n",
    "    ('4Dopant',four_atom_results),\n",
    "    ('5Dopant',five_atom_results),\n",
    "    ('6Dopant',six_atom_results),\n",
    "    ('7Dopant',seven_atom_results),\n",
    "    ('8Dopant',eight_atom_results)\n",
    ")\n",
    "\n",
    "# equivalent block to specify which formation energies will be used\n",
    "formation_sources = list()\n",
    "for n in xrange(1,9): ########################################\n",
    "    formation_sources.append(('formation_%d'%n,helper.parse_adsEnDB('newest_formation_structures/formEnSlabDBRelCoords%dDopant.txt'%n)))\n",
    "\n",
    "# Put all adsorption data into a dictionary\n",
    "alldata = dict()\n",
    "for name,datasource in datasources:\n",
    "    for key,value in datasource.iteritems():\n",
    "        if key in alldata:\n",
    "            print \"Warning:\",key,\"already exists\"\n",
    "        newvalue = [name]\n",
    "        newvalue.extend(value)\n",
    "        alldata[key]=newvalue\n",
    "\n",
    "# put all formation data into a dictionary\n",
    "formdata = dict()\n",
    "for name, datasource in formation_sources:\n",
    "    for key,value in datasource.iteritems():\n",
    "        if key in formdata:\n",
    "            print \"Warning:\",key,\"already exists\"\n",
    "        newvalue=[name]\n",
    "        newvalue.extend(value)\n",
    "        formdata[key]=newvalue\n",
    "        \n",
    "print\n",
    "print \"Found %d data points for adsorption\" % len(alldata)\n",
    "print \"Found %d data points for formation\" % len(formdata)\n",
    "\n",
    "# Returns the canonical value for one-hot vectors, such that\n",
    "# you can use either the lower or higher value\n",
    "def canonical_value(one_hot_vector,bias46=True):\n",
    "    # Returns a canonical value so that comparisons between symmetric\n",
    "    # configurations return the same ordering of bits\n",
    "    #\n",
    "    # NB: this requires 0's and 1's in the one-hot vector\n",
    "    #\n",
    "    # WARNING: values are length-dependent, don't try to compare one-hots\n",
    "    #          of differing lengths with this method\n",
    "    #\n",
    "    # Interprets the one-hot vector as a binary number, reverses the digit\n",
    "    # order, then returns the integer value. This effectively sorts\n",
    "    # one-hot vectors by least significant bit\n",
    "    #\n",
    "    # Bias 46: adds (2**100)*one_hot_vector[46]. Atom 46 in some ways\n",
    "    #          is the most important atom, so it may be beneficial to\n",
    "    #          make this *the* canonical bit (otherwise atom 47 takes\n",
    "    #          precedence even though it's not a nearest neighbor!)\n",
    "    result = 0\n",
    "    for i,value in enumerate(one_hot_vector):\n",
    "        result += value*(2**i)\n",
    "    \n",
    "    if bias46: # atom 46 in some ways is most important, puts\n",
    "        return result+(2**100)*one_hot_vector[46]\n",
    "    \n",
    "    return result\n",
    "\n",
    "\n",
    "#### WARNING: Will need to change hard-coded offset to handle non-H \n",
    "####          adsorbates or adsorbates at different index positionsF\n",
    "def reflect_one_hot(vector):\n",
    "    swapped = [ 0 for i in xrange(len(one_hot)) ]\n",
    "    for srcM in xrange(len(one_hot)): # iterate over indices in metal-only one-hot\n",
    "        src = srcM + 1 # original indices are +1 relative to metal-only one-hot vectors (H is index 0!)\n",
    "        des = reflect[src] # reflect index\n",
    "        desM = des - 1 # metal-only indices are -1 relative to original indices (no H atom)\n",
    "        swapped[desM] = one_hot[srcM] # map value at src in metal-only one-hot to its destination\n",
    "        \n",
    "    return swapped\n",
    "        \n",
    "for data in (alldata,formdata):\n",
    "    olddata = deepcopy(data)\n",
    "    data.clear()\n",
    "\n",
    "    for (one_hot,symbol),values in olddata.iteritems():\n",
    "        swapped = reflect_one_hot(one_hot)\n",
    "        if canonical_value(swapped) > canonical_value(one_hot):\n",
    "            data[(tuple(swapped),symbol)] = values\n",
    "        else:\n",
    "            data[(one_hot,symbol)] = values\n",
    "\n",
    "print \"Found %d adsorption data points after canonicalizing.\" % len(alldata)\n",
    "print \"Found %d adsorption data points after canonicalizing.\" % len(formdata)\n",
    "    \n",
    "\n",
    "# convenience function to get rmse values\n",
    "# NB: can use rmse(y_predict,y_known), or rmse(y_predict-y_known,0) (latter is useful for working with errors)\n",
    "def rmse(predicted,known):\n",
    "    return np.sqrt(np.mean(np.power(predicted-known,2.0)))\n",
    "\n",
    "\n",
    "# From the formation energy calculation for the Ag slab\n",
    "E_Ag_slab = chemdata.bulk_energies['Ag']*48.0 + 15.00\n",
    "\n",
    "# Get the energy of a single dopant in the \"bulk\" of these 211 slabs\n",
    "isolated_energies = defaultdict(float)\n",
    "bulk_like_indices = range(28,32) # as close to the bulk as you can get!\n",
    "\n",
    "already_warned = set()    \n",
    "for element in chemdata.elements:\n",
    "    if element != 'H' and element != 'Ag': # and element != 'Os':\n",
    "        fvals = list()\n",
    "        for position in bulk_like_indices:\n",
    "            onehot = map(lambda k: 1 if k==position else 0,xrange(48))\n",
    "            try:\n",
    "                fvals.append(formdata[(tuple(onehot),element)][2]*48)\n",
    "            except KeyError as err:\n",
    "                if element not in already_warned:\n",
    "                    print \"Missing bulk-like energy: \",err\n",
    "                else:\n",
    "                    already_warned.add(element)\n",
    "                    print \"Missing at least one bulk-like energy:\",err\n",
    "        \n",
    "        if len(fvals)>0:\n",
    "            isolated_energies[element] = np.mean(fvals)\n",
    "            print element, np.mean(fvals), np.std(fvals)    \n",
    "        else:\n",
    "            print \"Found no values for %s\"%element\n",
    "\n",
    "print\n",
    "print '----- starting in-bulk dimer formation energy ------'\n",
    "print\n",
    "\n",
    "# Similar loop, but gets the formation energies of dopant dimers in the \"bulk\" of the 211 slab\n",
    "bulk_like_dimer_indices = ((28,30),(30,29),(29,31),(31,28)) # pairs of indices for the dimers\n",
    "dimer_energies = defaultdict(float)\n",
    "\n",
    "already_warned = set()    \n",
    "for element in chemdata.elements:\n",
    "    if element not in ['H','Ag']:\n",
    "        dvals = list()\n",
    "        for i,j in bulk_like_dimer_indices:\n",
    "            onehot = map(lambda k: 1 if k==i or k==j else 0,xrange(48))\n",
    "            try:\n",
    "                dvals.append(formdata[(tuple(onehot),element)][2]*48 - 2*isolated_energies[element] + 15.00)\n",
    "            except KeyError as err:\n",
    "                if element not in already_warned:\n",
    "                    print \"Missing index arrangement (%d,%d): \"%(i,j),err\n",
    "                else:\n",
    "                    already_warned.add(element)\n",
    "                    print \"Missing at least one index arrangement: (%d,%d): \"%(i,j),err\n",
    "    \n",
    "        if len(dvals)>0:\n",
    "            dimer_energies[element] = np.mean(dvals)\n",
    "            print element, np.mean(dvals), np.std(dvals)\n",
    "        else:\n",
    "            print \"Found no values for %s\"%element\n",
    "            \n",
    "## Also a similar loop, but gets the adsorption energy \n",
    "## difference between a single step atom and a step dimer\n",
    "idx1 = 46 # metal-only indices (-1 relative to original structure indices)\n",
    "idx2 = 45\n",
    "dimer_vs_monomer_adsorption = defaultdict(float)\n",
    "\n",
    "already_warned = set()\n",
    "\n",
    "for element in chemdata.elements:\n",
    "    if element in ('Ag','H','Os'):\n",
    "        continue\n",
    "        \n",
    "    single_values = list()\n",
    "    for idx in (idx1,idx2):\n",
    "        onehot = [0 for i in xrange(48)]\n",
    "        onehot[idx] = 1\n",
    "        \n",
    "        key = (tuple(onehot),element)\n",
    "        if key in alldata:\n",
    "            single_values.append(alldata[key][2])\n",
    "            \n",
    "    assert len(single_values)>0, \"ERROR: Element %s has no step dopant configurations!\"%element\n",
    "    \n",
    "    onehot = [0 for i in xrange(48)]\n",
    "    onehot[idx1] = 1\n",
    "    onehot[idx2] = 1\n",
    "    key = (tuple(onehot),element)\n",
    "    if key in alldata:\n",
    "        dimer_value = alldata[key][2]\n",
    "        dimer_vs_monomer_adsorption[element] = dimer_value - np.mean(single_values)\n",
    "    else:\n",
    "        if element not in already_warned:\n",
    "            print \"WARNING: Element %s has no step dimer configuration!\"%element\n",
    "            print \"            (setting difference to -10. as a \\\"warning flag\\\")\"\n",
    "        else:\n",
    "            already_warned.add(element)\n",
    "            print \"WARNING: Element %s is missing at least one configuration (set legacy_debug=True for details)\"%element\n",
    "        \n",
    "D = s.get_all_distances(mic=True)\n",
    "include_coulomb_matrix = True\n",
    "\n",
    "Dmtx = s.get_all_distances(mic=True)[1:,1:]\n",
    "coulomb_range = 8.0 \n",
    "coulomb_indices = [i for i in xrange(len(s)-1) if distances[i] < coulomb_range]\n",
    "Dmtx = Dmtx[coulomb_indices,:]\n",
    "Dmtx = Dmtx[:,coulomb_indices]\n",
    "print Dmtx.shape\n",
    "Cindices = np.triu_indices_from(Dmtx) # upper triangular indices\n",
    "invD = 1.0/Dmtx # yields NaN on diagonal\n",
    "np.nan_to_num(invD) # diagonals now have 1e308\n",
    "invD[invD>1e300] = 0 # set diagonals to zero instead\n",
    "\n",
    "# Setup for tight-binding hamiltonian\n",
    "Dnn = s.get_all_distances(mic=True)\n",
    "Dnn = Dnn[1:,:]\n",
    "Dnn = Dnn[:,1:]\n",
    "Dnn[Dnn>3.5] = 0.\n",
    "invDnn5 = np.power(Dnn,-5.0)\n",
    "np.nan_to_num(invDnn5)\n",
    "invDnn5[invDnn5>1e300] = 0.\n",
    "\n",
    "original_indices = [i for i in range(len(s)-1) if distances[i]<8.0]\n",
    "old_to_new_index = dict()\n",
    "for i, original_index in enumerate(original_indices):\n",
    "    old_to_new_index[original_index] = i\n",
    "\n",
    "core8 = [35,37,39,40,42,43,45,46]\n",
    "dimers = list()\n",
    "trimers = list()\n",
    "for i,di in enumerate(distances):\n",
    "    for j,dj in enumerate(distances[:i]):\n",
    "        if i in core8 and j in core8 and s.get_distance(i+1,j+1)<3:\n",
    "            dimers.append((i,j))\n",
    "for i,di in enumerate(distances):\n",
    "    for j,dj in enumerate(distances[:i]):\n",
    "        for k,dk in enumerate(distances[:j]):\n",
    "            if i in core8 and j in core8 and k in core8 and s.get_distance(i+1,j+1)<3 and s.get_distance(i+1,k+1)<3 and s.get_distance(j+1,k+1)<3:\n",
    "                trimers.append((i,j,k))\n",
    "\n",
    "first = True # flags that this is our first pass\n",
    "features_values = defaultdict(list)\n",
    "features_form_values = defaultdict(list)\n",
    "for (one_hot,symbol),values in alldata.iteritems():\n",
    "    new_one_hot = [one_hot[i] for i in range(len(s)-1) if distances[i]<8.0]\n",
    "    num_1NN = sum([one_hot[i] for i in range(len(s)-1) if distances[i]<3.0])\n",
    "    num_2NN = sum([one_hot[i] for i in range(len(s)-1) if distances[i]<4.0]) - num_1NN\n",
    "    num_3NN = sum([one_hot[i] for i in range(len(s)-1) if distances[i]<5.0]) - num_1NN - num_2NN\n",
    "    num_rest = sum(one_hot) - num_1NN - num_2NN - num_3NN\n",
    "    \n",
    "    HMproximity = np.min(s.get_distances(0,[i+1 for i in range(len(s)-1) if one_hot[i]==1]))\n",
    "    if sum(one_hot)<2:\n",
    "        MMproximity = 10\n",
    "        spread = 10\n",
    "        step_dimer = 0\n",
    "    else:\n",
    "        MMproximity = np.min([D[i+1,j+1] for i,j in product(xrange(len(one_hot)),repeat=2) if one_hot[i]>0 and one_hot[j]>0 and i>j])\n",
    "        spread = np.sum(np.var([s.positions[i+1] for i in xrange(len(one_hot)) if one_hot[i]>0],axis=0))\n",
    "    \n",
    "        # pair of atoms along the step edge\n",
    "        step_dimer = 0\n",
    "        for i,j in ((47,45),(45,46),(46,44),(44,47)):\n",
    "            if one_hot[i] and one_hot[j]:\n",
    "                step_dimer = 1\n",
    "                break      \n",
    "                \n",
    "        # pair of atoms, one in surface and one in subsurface\n",
    "        interlayer_dimer = 0\n",
    "        for i,j in chain(product(xrange(36,48),xrange(24,36)),product((44,45,46,47),(36,37,38,39))):\n",
    "            if one_hot[i] and one_hot[j] and D[i+1,j+1] < 3.3:\n",
    "                interlayer_dimer = 1\n",
    "                break\n",
    "                \n",
    "        # pair of atoms in surface layer (partially redundant with step_dimer)\n",
    "        surface_dimer = 0\n",
    "        for i,j in product(xrange(40,48),repeat=2):\n",
    "            if i>j and one_hot[i] and one_hot[j] and D[i+1,j+1]<3.3:\n",
    "                surface_dimer=True\n",
    "                break\n",
    "    \n",
    "        cross_step_dimer = 0\n",
    "        for i,j in product((44,45,46,47),(36,37,38,39)):\n",
    "            if one_hot[i] and one_hot[j] and D[i+1,j+1]>=3.3 and D[i+1,j+1]<4.3:\n",
    "                cross_step_dimer = 1\n",
    "                break\n",
    "    \n",
    "    step_neighbors = 0\n",
    "    for index in (45,46):\n",
    "        step_neighbors += one_hot[index]\n",
    "    \n",
    "    chem_features = [prop_val[symbol] for (prop_name,prop_val) in chemdata.properties]\n",
    "    \n",
    "    if include_coulomb_matrix:\n",
    "        \n",
    "        # Include empirical tight-binding model... solving for eigenvectors is something that forests cannot do\n",
    "        qI = np.asarray([ b*(chemdata.d_radius[symbol]-chemdata.d_radius['Ag'])+chemdata.d_radius['Ag'] for b in one_hot])\n",
    "        Htb = 13*7.62*invDnn5*np.power(np.outer(qI,qI),3./2.)\n",
    "        Htb += np.diag([b*(-chemdata.d_energy[symbol]+chemdata.d_energy['Ag'])-chemdata.d_energy['Ag'] for b in one_hot])\n",
    "        evals,evecs = np.linalg.eigh(Htb)\n",
    "        # moments of energies weighted by state projections on each atom\n",
    "        moments = []\n",
    "        for neighbor in nearest:\n",
    "            mean = np.sum( (np.abs(evecs[neighbor,:])**2) * evals )\n",
    "            moments.append(mean)\n",
    "            for p in xrange(3+1):\n",
    "                moments.append(np.sum( (np.abs(evecs[neighbor,:])**2) * np.power(evals-mean,p+2) ))\n",
    "        top5 = evals[np.argsort(-evals)[:]] # FIXME: actually using all of them\n",
    "        if first:\n",
    "            print np.linalg.norm(evecs,axis=1)\n",
    "        \n",
    "        # Sum of NN \"isolation value\" of (e_I - e_J)/V ( in this case, w/o constants: delta_e/(r_I*r_J)**1.5 )\n",
    "        isolation_values = list()\n",
    "        for prop in (chemdata.d_energy,chemdata.work_functions): # use d_energy and work function for ed and ed_vals\n",
    "            neighbor_symbols = list()\n",
    "            ed_vals = np.asarray([b*(prop[symbol] - prop['Ag']) + prop['Ag'] for b in one_hot])\n",
    "            rd_vals = np.asarray([b*(chemdata.d_radius[symbol] - chemdata.d_radius['Ag']) + chemdata.d_radius['Ag'] for b in one_hot])\n",
    "            for neighbor in nearest:\n",
    "                if one_hot[neighbor]:\n",
    "                    ed = prop[symbol]\n",
    "                    rd = chemdata.d_radius[symbol]\n",
    "                    neighbor_symbols.append(symbol)\n",
    "                else:\n",
    "                    ed = prop['Ag']\n",
    "                    rd = chemdata.d_radius['Ag']\n",
    "                    neighbor_symbols.append('Ag')\n",
    " \n",
    "                isNN = Dnn[:,neighbor]>0 # is 1NN wrt each nearest neighbor\n",
    "                isolation_values.append( np.sum(isNN*(ed - ed_vals)/np.power(rd_vals*rd,3./2.)) )\n",
    "               \n",
    "        C = np.copy(invD) # construct C_{IJ} in-place, starting from inverse distances and 0 diagonal\n",
    "        prop = chemdata.col # use this property, relative to Ag, as the coulomb matrix charge\n",
    "        qI = np.asarray([b*(prop[symbol]-prop['Ag'])+prop['Ag'] for b in one_hot]) # col_I for each atom\n",
    "        qI = qI[coulomb_indices]\n",
    "        QIJ = np.outer(qI,qI) # Q_{IJ} = col_I * col_J\n",
    "        C *= QIJ # correct C_{IJ} for I != J\n",
    "        C += np.diag(np.power(qI,2.4)) # C_{IJ} complete\n",
    "        C = np.sum(C,axis=0) # overwrite with column sum\n",
    "        \n",
    "    # Structure features\n",
    "    features = list()\n",
    "    features.extend(new_one_hot)\n",
    "    features.extend([num_1NN,num_2NN,num_3NN,num_rest,HMproximity,MMproximity,spread,\n",
    "                     step_dimer,interlayer_dimer,surface_dimer,cross_step_dimer,step_neighbors,\n",
    "                     dimer_energies[symbol],dimer_vs_monomer_adsorption[symbol],sum(one_hot),\n",
    "                     isolated_energies[symbol]])\n",
    "\n",
    "    # Chemical features\n",
    "    features.extend(chem_features)\n",
    "   \n",
    "    if include_coulomb_matrix:\n",
    "        features.extend(moments)\n",
    "        features.extend(top5)\n",
    "        features.extend(C)\n",
    "        features.extend(isolation_values)\n",
    "        for prop in (chemdata.d_energy,chemdata.work_functions):\n",
    "            features.extend([prop[sym]-prop['Ag'] for sym in neighbor_symbols])\n",
    "    \n",
    "    # Use the same features for both formation and adsorption energies\n",
    "    features_values[tuple(features)].append(values)\n",
    "    features_form_values[tuple(features)].append(formdata[(one_hot,symbol)])\n",
    "    \n",
    "    \n",
    "    if first:\n",
    "        print \"Number of features:\",len(features)\n",
    "        \n",
    "    first = False\n",
    "\n",
    "feature_names = [str(i) for i in range(len(s)-1) if distances[i]<8.0]\n",
    "feature_names.extend(['num1NN','num2NN','num3NN','numRest','HMproximity','MMproximity','spread',\n",
    "                      'step_dimer','IL_dimer','surf_dimer','Xstep_dimer','stepNN',\n",
    "                      'dimer_E','DvM_Eads','numM','EF_isolated'])\n",
    "feature_names.extend([prop_name for (prop_name,prop_val) in chemdata.properties])\n",
    "\n",
    "if include_coulomb_matrix:\n",
    "    for neighbor in nearest:\n",
    "        feature_names.extend(['moment_%d_%d'%(neighbor,i+1) for i in xrange(5)])\n",
    "\n",
    "    feature_names.extend(['TBe_%d'%(i+1) for i in xrange(len(top5))])\n",
    "\n",
    "    feature_names.extend(['C_%d'%i for i in xrange(len(C))])\n",
    "\n",
    "    \n",
    "    feature_names.extend(['iso_%d'%(i+1) for i,v in enumerate(isolation_values)])\n",
    "         \n",
    "    for prop,proot in ((chemdata.d_energy,'ed'),(chemdata.work_functions,'wf')):\n",
    "        feature_names.extend(['d_%s_%d'%(proot,i+1) for i,sym in enumerate(neighbor_symbols)])\n",
    "    \n",
    "for i, name in enumerate(feature_names):\n",
    "    print i,\"\\t\",name\n",
    "\n",
    "stdvals = dict()\n",
    "for features,energies in features_values.iteritems():\n",
    "    if len(energies) > 1:\n",
    "        print \"Warning: feature has multiple values!\"\n",
    "        print \"stdev(energies) =\",np.std(map(lambda T:T[-1],energies))\n",
    "        stdvals[features] = np.std(map(lambda T:T[-1],energies))\n",
    "\n",
    "### a list of indices corresponding to the original features\n",
    "initial_features = []\n",
    "# Add one-hot bits\n",
    "for site in xrange(48):\n",
    "    name = '%02d'%site\n",
    "    if name in feature_names:\n",
    "        initial_features.append(feature_names.index(name))\n",
    "# add other features\n",
    "for name in ['num1NN','num2NN','num3NN','numRest',\n",
    "             'pauling','cov_radius','density','H_fusion','ion_energy','bulk_modulus',\n",
    "             'poisson_ratio','conductivity','d_energy','d_radius','row','col']:\n",
    "    initial_features.append(feature_names.index(name))\n",
    "    \n",
    "print len(initial_features)\n",
    "print initial_features\n",
    "\n",
    "\n",
    "def smallest_distance(structure,position=None):\n",
    "    \"\"\" Returns the smallest distance from any dopant atom in the structure to the given position.\n",
    "    \n",
    "        If position is None or not given, the position of the structure's 0th atom is used instead.\"\"\"\n",
    "    \n",
    "    if position is None:\n",
    "        p = structure.positions[0]\n",
    "    else:\n",
    "        p = position\n",
    "    distances = [(i, np.linalg.norm(structure.positions[i] - p)) for i in xrange(len(structure)) if structure[i].symbol not in ['H','Ag']]\n",
    "    k = np.argmin(distances,axis=0)\n",
    "    return distances[k[1]]\n",
    "\n",
    "def minimal_metal_distances(structure,reference,structure_offset=0,reference_offset=0):\n",
    "    \"\"\" Returns a vector of distances between atoms in the structure to their original\n",
    "        positions in the reference structure. structure[i] is assumed to be the same atom\n",
    "        as reference[i]. Distances are minimized with respect to periodic boundary\n",
    "        conditions, an atom at position r+R in one structure and at position r in the other\n",
    "        structure has 0 displacement when R is a linear combination of lattice vectors.\n",
    "        \n",
    "        If structure has more atoms than reference, e.g. comparing adsorbed vs slab geometries,\n",
    "        then structure[i+offset] is compared to reference[i] instead.\n",
    "        \n",
    "        Similarly, atoms can be skipped by setting reference_offset as well. Then\n",
    "        structure[i+structure_offset] is compared to reference[i+reference_offset].\n",
    "        \n",
    "        Example: calculate metal atom motion between the relaxed and input geometries:\n",
    "          minimal_metal_distances(struct_dict[key],input_geometry,1,1)\n",
    "        (setting both offsets to 1 skips the H atoms at index 0)\"\"\"\n",
    "    \n",
    "    if (len(structure)-structure_offset) != (len(reference)-reference_offset):\n",
    "        raise ValueError(\"ERROR: considering different numbers of atoms in each structure!\")\n",
    "    \n",
    "    newstruct = deepcopy(structure)\n",
    "    newstruct.extend(deepcopy(reference))\n",
    "    newstruct.set_cell(reference.get_cell())\n",
    "    \n",
    "    n = len(structure)\n",
    "    return np.asarray(\n",
    "        [newstruct.get_distance(i+structure_offset,i+reference_offset+n,mic=True) for i in xrange(n-structure_offset)])\n",
    "\n",
    "reload(helper)\n",
    "\n",
    "def wrap_structure(structure,position=None):\n",
    "    if position is None:\n",
    "        p = structure.positions[0] # Default to 0th atom\n",
    "    else:\n",
    "        p = position\n",
    "    for i in xrange(1,len(structure)):\n",
    "        results = [(m,n,np.linalg.norm(structure.positions[i]+m*cell[0]+n*cell[1]-p)) for m,n in product((-1,0,1),repeat=2)]\n",
    "        k = np.argmin(results,axis=0)\n",
    "        structure.positions[i] += results[k[2]][0]*cell[0] + results[k[2]][1]*cell[1]\n",
    "\n",
    "print \"Reading adsorption structures\"\n",
    "if 'struct_list' not in locals(): # Time-consuming; only do this once\n",
    "    struct_list = helper.read_structures_xyz('newest_structures/')\n",
    "    \n",
    "print \"Found %d formation energy structures\"%(len(struct_list))\n",
    "    \n",
    "print \"Reading slab (formation) structures\"\n",
    "if 'struct_form_list' not in locals():\n",
    "    struct_form_list = helper.read_structures_xyz('newest_formation_structures/')\n",
    "    \n",
    "cell = np.loadtxt('structures/cell')\n",
    "\n",
    "symmetrize_canonical = False # train and predict on sum of each\n",
    "                            # one-hot vector and its reflection\n",
    "                            # (this effectively symmetrizes the kernel)\n",
    "        \n",
    "one_hot_HM_MM_clusters = True\n",
    "\n",
    "new_HM_MM_clusters = False # Use KDTree approach instead of old k-means approach\n",
    "        \n",
    "if symmetrize_canonical and not canonicalize and symmetrize:\n",
    "    print \"WARNING: already training on symmetric configurations\"\n",
    "    print \"         symmetrizing points will lead to bias since duplicates\"\n",
    "    print \"         will yield identical points!\"\n",
    "\n",
    "use_dilute_chemical_potential = False # chemical potential is isolated M atom in Ag(211) subsurface\n",
    "H_adsorption_modifier = 0 # 0===do not include, 1===include 1 H, n===include n H's (only 2-3 makes sense?)\n",
    "Eads_Ag = 0.08 # reference value for Ag itself (Eads_Ag==0 yields Eads relative to 1/2 H_2)\n",
    "\n",
    "# Construct the feature matrix X and corresponding adsorption/formation energies y and yf, respectively\n",
    "# X[i] corresponds to y[i], and src[i] gives the \"source\" data for the i^th data point\n",
    "src = list()\n",
    "srcf = list()\n",
    "X = list()\n",
    "y = list() # adsorption energies\n",
    "yf = list() # formation energies\n",
    "for features,values in features_values.iteritems():\n",
    "    for v in values:\n",
    "        X.append(features)\n",
    "        y.append(v[2] - Eads_Ag)\n",
    "        src.append(v[:2])\n",
    "    for v in features_form_values[features]:\n",
    "        srcf.append(v[:2])\n",
    "        num_dopants = sum( X[-1][31:35] )\n",
    "        if use_dilute_chemical_potential:\n",
    "            element = symbol_from_structure(struct_dict[tuple(v[:2])])\n",
    "            yf.append(v[2]*48.0 + H_adsorption_modifier*(y[-1] - 0*Eads_Ag) + num_dopants*chemdata.bulk_energies[element]\n",
    "                      - num_dopants*(isolated_energies[element] - E_Ag_slab + chemdata.bulk_energies['Ag']))\n",
    "        else:\n",
    "            yf.append(v[2]*48 + H_adsorption_modifier*(y[-1] - 0*Eads_Ag))\n",
    "\n",
    "# Cast X to a numpy array, then symmetrize (one_hot + one_hot_reflected) if desired\n",
    "X = np.asarray(X)\n",
    "\n",
    "print X.shape[1]\n",
    "print len(feature_names)\n",
    "\n",
    "if symmetrize_canonical:\n",
    "    print\n",
    "    print \"WARNING: assuming there are 31 one-hot bits to symmetrize\"\n",
    "    print \"         and only one-hot elements 16 and onward are kept, in order\"\n",
    "    Xs = np.zeros((X.shape[0],31))\n",
    "    for src,des in reflect.iteritems(): # produce reflected one-hot features\n",
    "        assert not ((src-1 in original_indices)^(des-1 in original_indices)), \"ERROR: either src or des has no target in the new one-hot vector\"\n",
    "        if src-1 in original_indices:\n",
    "            Xs[:,old_to_new_index[des-1]] = X[:,old_to_new_index[src-1]]\n",
    "    assert all(X[:,:31].sum(axis=1)==Xs.sum(axis=1)), \"ERROR: some swaps changed the number of atoms!\"\n",
    "    # symmetrize through addition\n",
    "    X[:,:31] += Xs\n",
    "    del Xs\n",
    "\n",
    "# Convert H-metal proximity and spread in metal atom positions (standard deviation)\n",
    "# to one-hot-coded clusters\n",
    "debug = False\n",
    "nc = 100 # works well for 1-2 dopants, should re-check for 3+ dopants\n",
    "eps = 0.35 # point-point distance used for merging clusters, should also re-check for 3+ dopants\n",
    "fMM = feature_names.index('HMproximity')\n",
    "fHM = feature_names.index('spread')\n",
    "foo = np.copy(X[:,(fMM,fHM)])\n",
    "\n",
    "if not new_HM_MM_clusters:\n",
    "    print \"KMeans clustering on features (%d,%d) using %d clusters\"%(fMM,fHM,nc)\n",
    "    kmeans = KMeans(n_clusters=nc,precompute_distances=True,n_jobs=4,random_state=0)\n",
    "    labels = kmeans.fit_predict(foo)\n",
    "else: # KDTree\n",
    "    kdt = KDTree(foo,leaf_size=3)\n",
    "\n",
    "    labels = np.zeros(foo.shape[0],dtype=np.int32)\n",
    "    l=0\n",
    "    for node in kdt.node_data:\n",
    "        if node['is_leaf']:\n",
    "            labels[kdt.idx_array[node['idx_start']:node['idx_end']]]=l\n",
    "            l+=1\n",
    "    if debug:\n",
    "        print \"Found %d kd-tree leaves\"%l\n",
    "        \n",
    "    plt.scatter(foo[:,0],foo[:,1],c=leaf,edgecolor=[0,0,0,0.3],cmap='flag')\n",
    "\n",
    "# K-means clustering is too agressive; it cuts some clusters in half\n",
    "# This merges clusters if any two points are 'close enough' (see value of eps above)\n",
    "for i in xrange(len(labels)):\n",
    "    this_label = labels[i]\n",
    "    dists = np.linalg.norm(foo[i+1:]-foo[i],axis=1)\n",
    "    targets = (dists<=eps)*(labels[i+1:] != this_label)\n",
    "    for other_label in np.unique(labels[i+1:][targets]):\n",
    "        labels[labels==other_label] = this_label\n",
    "#        if debug:\n",
    "#            print \"Merged cluster %d into %d\"%(other_label,this_label)\n",
    "\n",
    "print\n",
    "print \"Found %d unique clusters after merging\"%len(np.unique(labels))\n",
    "\n",
    "# color-coded scatterplot and contour plot with matching colors to show clusters\n",
    "if debug:\n",
    "    plt.figure()\n",
    "    knr = KNR(n_neighbors=1)\n",
    "    knr.fit(foo,labels)\n",
    "    xx1,xx2 = np.meshgrid(np.linspace(0,13,130),np.linspace(0,30,300))\n",
    "    pts = np.c_[xx1.ravel(),xx2.ravel()]\n",
    "    plt.contourf(xx1,xx2,knr.predict(pts).reshape(xx1.shape),nc+1,alpha=0.6)\n",
    "    foo += np.random.normal(scale=eps/100,size=foo.shape)\n",
    "    plt.scatter(foo[:,0],foo[:,1],c=labels,edgecolors=[0,0,0,0.6])\n",
    "    indices = np.unique(labels)\n",
    "    populations = np.bincount(np.asarray(labels,dtype=np.int32))\n",
    "    #plt.title('%d'%nc).set_fontsize(20)\n",
    "    plt.gca().set_aspect(1)\n",
    "    for cluster in indices:\n",
    "        mu = np.mean(foo[labels==cluster],axis=0)\n",
    "        plt.text(mu[0],mu[1],'|%d|=%d'%(cluster,populations[cluster]))\n",
    "\n",
    "# Generate the one-hot coding of these clusters, then append to X\n",
    "ohe = OneHotEncoder()\n",
    "hots = ohe.fit_transform(labels.reshape(-1,1))\n",
    "\n",
    "print\n",
    "print \"--- generated corresponding one-hot coding ---\"\n",
    "print\n",
    "\n",
    "if one_hot_HM_MM_clusters:\n",
    "    X = np.c_[X,hots.toarray()] # appends one-hot cluster vector to other features\n",
    "    for f in xrange(hots.shape[1]):\n",
    "        feature_names.append('cluster_%d'%f)\n",
    "\n",
    "y = np.asarray(y)\n",
    "yf = np.asarray(yf)\n",
    "\n",
    "S = read('template.vasp')\n",
    "wrap_structure(S)\n",
    "\n",
    "if 'struct_dict' not in locals():\n",
    "    struct_dict = dict()\n",
    "    print \"Processing adsorption structures\"\n",
    "    for i, (series, number, structure) in enumerate(struct_list):\n",
    "        if i and not i%1000:\n",
    "            print \"%.2f%% done\"%(i*100.0/len(struct_list))\n",
    "        structure.set_cell(cell)\n",
    "        structure.set_pbc(True)\n",
    "\n",
    "        struct_dict[(series,number)] = structure\n",
    "\n",
    "if 'struct_form_dict' not in locals():\n",
    "    struct_form_dict = dict()\n",
    "    print \"Processing slab (formation) structures\"\n",
    "    for i, (series, number, structureF) in enumerate(struct_form_list):\n",
    "        if i and not i%1000:\n",
    "            print \"%.2f%% done\"%(i*100.0/len(struct_list))\n",
    "\n",
    "        structureF.set_cell(cell)\n",
    "        structureF.set_pbc(True)\n",
    "\n",
    "        wrap_structure(structureF,s[0].position)\n",
    "\n",
    "        struct_form_dict[(series,number)] = structureF\n",
    "\n",
    "print \"--- COMPLETE ---\"\n",
    "\n",
    "# WARNING: I suspect that the order of points in src, srcf changes whenever the data is re-imported\n",
    "# (hence performing all data setup tasks up front)\n",
    "\n",
    "#### WARNING: 2 MINUTES\n",
    "### ALSO WARNING: stepdir is NOT normalized!! (it's norm is 8-ish ang.)\n",
    "%time disp_ads = np.asarray([ minimal_metal_distances(struct_dict[tuple(src[i])],S,1+16,1+16) for i in xrange(len(y))])\n",
    "dRM_ads = np.linalg.norm(disp_ads,axis=1)\n",
    "%time disp_slab = np.asarray([ minimal_metal_distances(struct_form_dict[tuple(srcf[i])],S,0+16,1+16) for i in xrange(len(y))])\n",
    "dRM_slab = np.linalg.norm(disp_slab,axis=1)\n",
    "%time disp_ads_slab = np.asarray([ minimal_metal_distances(struct_form_dict[tuple(srcf[i])],struct_dict[tuple(src[i])],16,1+16) for i in xrange(len(y))])\n",
    "dRM_ads_slab = np.linalg.norm(disp_ads_slab,axis=1)\n",
    "\n",
    "%time dRH = np.asarray([ np.linalg.norm(struct_dict[tuple(key)].positions[0]-s.positions[0]) for key in src])\n",
    "%time dRHs = np.asarray([ np.dot(struct_dict[tuple(key)].positions[0]-s.positions[0],stepdir) for key in src])\n",
    "dRHperp = np.sqrt(dRH**2-(dRHs/np.linalg.norm(stepdir))**2)\n",
    "\n",
    "%time initial_distances = [s.get_distances(0,[i+1 for i,b in enumerate(structure_to_key(struct_dict[tuple(key)])[0]) if b==1],mic=True) for key in src]\n",
    "%time final_distances = [struct_dict[tuple(key)].get_distances(0,[i+1 for i,b in enumerate(structure_to_key(struct_dict[tuple(key)])[0]) if b==1],mic=True) for key in src]\n",
    "\n",
    "%time top_layer_initial_distances = s.get_distances(0,range(49)[-16:],mic=True)\n",
    "%time top_layer_final_distances = np.asarray([struct_dict[tuple(key)].get_distances(0,range(len(s))[-16:],mic=True) for key in src])\n",
    "\n",
    "smallest_three = np.sort(top_layer_final_distances,axis=1)[:,:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# group1: structures with high slab reconstruction\n",
    "# group2: structures with high H atom displacement\n",
    "# criterion: the 97.5% of structures not falling into groups 1 and 2\n",
    "group1 = (dRM_ads_slab > 1.3)\n",
    "group2 = (dRHperp <= 0.6) + ((np.abs(dRHs/np.linalg.norm(stepdir))<=1.0)*(dRHperp <= 2.3)) # inverse; included points\n",
    "group2 = (~group2)*(~group1) # i.e. rejected by new criterion2 and not already in group1\n",
    "group3 = ((X[:,feature_names.index('MMproximity')]<3.5)*(X[:,feature_names.index('HMproximity')]<2.5))*(~group1)*(~group2)\n",
    "rest = (~group1)*(~group2)*(~group3)\n",
    "criterion = group3+rest # i.e. not excluded by reconstruction or H migration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Evaluate the oneHotFullEncode matrix, called the \"binary matrix\" in the paper for clarity\n",
    "\n",
    "#'includeSilver' option tells whether the one-hot for Ag should be included. 'elementOrder' gives the order that the elements will appear in the vector.\n",
    "includeSilver=True\n",
    "elementOrder = ['Ag','Cd','Zn','Au','Cu','Pt','Pd','Ni','Ir','Rh','Co','Ru','Fe','Re','Mn','W','Mo','Cr','Ta','V','Hf','Zr','Ti']\n",
    "oneHotConvRef = (16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 46, 45, 42, 43, 47, 39, 44, 41, 40)\n",
    "\n",
    "if includeSilver==False:\n",
    "    elementOrder.remove('Ag')\n",
    "oneHotRawList = [X[i,0:31] for i in range(len(X))]\n",
    "oneHotListFinal = oneHotRawList\n",
    "oneHotStrucLength = len(oneHotListFinal[0])\n",
    "zeros = [0] * oneHotStrucLength * len(elementOrder)\n",
    "oneHotFullEncode = np.array([zeros] * len(oneHotListFinal))\n",
    "whichEl = ['Xx'] * len(X)\n",
    "posEl = [0] * len(X)\n",
    "for i in range(0,len(X)):\n",
    "    #view(struct_dict[tuple(src[i])])\n",
    "    whichEl=symbol_from_structure(struct_dict[tuple(src[i])])\n",
    "    posEl=elementOrder.index(whichEl)\n",
    "    offset = posEl * oneHotStrucLength\n",
    "    oneHotFullEncode[i][offset:offset+oneHotStrucLength] = oneHotListFinal[i]\n",
    "    if includeSilver:\n",
    "        offset = 0\n",
    "        oneHotFullEncode[i][offset:offset+oneHotStrucLength] = np.abs(oneHotListFinal[i]-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
